# Libraries
#install.packages("heatmaply")
#install.packages("tidyverse")
### Setting up environment and loading data
#
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(reshape2)
library(ggplot2)
library(knitr)
library(scales)
library(gtools)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(png)
library(circlize)
library(ggpubr)
## Loading required package: magrittr
## color palette
library(grDevices)
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(RColorBrewer)
### Loading data
bap <- read.csv2("../data/KP_screening_data.csv", header = TRUE, sep = ",", na.strings = "", stringsAsFactors = FALSE)
### Summary
#summary(bap)
Exploring further: distribution of major resistance determinants identified Data presented below focuses on the top 50 resistance genes identified in the collection.
### generate resistance matrix from BAP profile data and merge existing tables
## count most frequent resistance genes (can't use count() on single column)
# split data by comma and put in a unique list then convert to dataframe
# add column name
# arrange by descending values of frequency
resdata <- data.frame(table(unlist(strsplit(bap$resistance_genes, ','))))
names(resdata)[1] = 'resistance_gene'
resdata <- arrange(resdata,desc(Freq))
# extract top 50 as a character vector and sort alphabetically
#resdata <- filter(resdata, Freq>1)
resgenes <- as.character(resdata$resistance_gene[0:50]) ### <- change here to increase cut-off
resgenes <- sort(resgenes)
# display top frequent alleles found
resdata[0:ncol(resdata),]
## resistance_gene Freq
## 1 fosA 1550
## 2 oqxB 1491
# generate empty res matrix
res_mat <- data.frame(matrix(nrow = nrow(bap), ncol = (length(resgenes)+1)), stringsAsFactors = FALSE)
colnames(res_mat) <- (c("Id", resgenes))
# add Id from bap table
res_mat$Id <- bap$Id
# populate matrix
for (resgene in resgenes) {
# record position of resgene to assign column properly in res_mat
pos = match(resgene,resgenes)
for (i in 1:nrow(bap)) {
reslist <- unlist(strsplit(bap$resistance_genes[i], ','))
occ <- 0
# check if reslist contains "NA"
if (all(is.na(reslist))) {
### populate matrix with occurence results
res_mat[i, pos+1] <- occ
}
else {
for (j in 1:length(reslist)) {
# check and count occurences of resgene in reslist
if (resgene == reslist[[j]]) {
occ <- occ + 1
j <- j + 1
}
else {
j <- j + 1
}
### populate matrix with occurence results
res_mat[i, pos+1] <- occ
}
}
}
}
### display freqs
# colSums(res_mat[2:ncol(res_mat)])
### merge data to main matrix
bapmatres <- full_join(bap, res_mat, by = "Id")
### export table for visualisation purpose
# out <- res_mat
# out <- as.matrix(out)
# rownames(out) <- out[, 1] ## add name
# test <- test[, -1] ## remove 1st column
# col.order <- hclust(dist(t(out)))$order ## reorder columns using hierarchical clustering
# out2 <- out[, col.order]
# write.csv(out,"reorder_res_mat.csv")
res_mat_ST <- merge(x = res_mat, y = bap[, c("Id", "ST")], by = "Id", all.x=TRUE)
# replace empty fields by NA
res_mat_ST$ST <- sub("^$", NA, res_mat_ST$ST)
# convert as.numeric to sort by ST
res_mat_ST$ST <- as.numeric(res_mat_ST$ST)
## Warning: NAs introduced by coercion
res_mat_ST <- res_mat_ST[order(res_mat_ST$ST), ]
# calculate sum by ST for each column (resgene) and add 1st column unique ST ID)
res_freq_ST <- cbind(unique(res_mat_ST$ST),ddply(res_mat_ST[, 2:(ncol(res_mat_ST))], "ST", colSums))
# remove last column which contains sum(ST)
res_freq_ST <- res_freq_ST[,1:(ncol(res_freq_ST)-1)]
colnames(res_freq_ST)[1] <- "ST"
# build frequences data for Res_Type by ST
freqpairs <- melt(res_freq_ST, "ST")
freqpairs$ST <- as.character(freqpairs$ST)
ggplot(freqpairs, aes(x = ST, y = value, fill = variable)) + geom_col(position = position_stack(reverse = TRUE), colour="black", size=0.2) + theme(axis.text.x=element_text(angle=60, hjust=1)) + ggtitle("Distribution of most frequent alleles by ST")
# build frequences data for Type_family by Subgroup
freqpairs <- melt(res_freq_ST, "ST")
freqpairs$ST[is.na(freqpairs$ST)] <- "NT"
#freqpairs <- filter(freqpairs, freqpairs$value >2)
significant <- c('NT', '101', '11', '14', '147', '278', '280', '15', '16', '17', '120', '200', '231', '258', '127', '273', '277', '29', '278', '3', '336', '37', '392', '442', '512', '873')
freqpairs <- filter(freqpairs, ST %in% significant) %>% na.omit()
##############
circos.clear()
circos.par(gap.after = c(rep(1, length(unique(freqpairs[[1]]))-1), 8, rep(1, length(unique(freqpairs[[2]]))-1), 8))
grid.col = c(NT = "grey", "258" = "red")
#circos.par(start.degree = 270, clock.wise = FALSE)
chordDiagram(freqpairs,
annotationTrack = c("grid", "axis"),
preAllocateTracks = list(track.height = 0.1),
link.sort = TRUE, link.decreasing = TRUE,
grid.col = grid.col,
#directional = -1, direction.type = c("diffHeight", "arrows"), link.arr.type = "big.arrow",
order = c((mixedsort(unique(freqpairs[[1]]), decreasing = TRUE)), as.character(mixedsort(unique(freqpairs[[2]])))))
circos.trackPlotRegion(track.index = 1, panel.fun = function(x, y) {
xlim = get.cell.meta.data("xlim")
xplot = get.cell.meta.data("xplot")
ylim = get.cell.meta.data("ylim")
sector.name = get.cell.meta.data("sector.index")
if(abs(xplot[2] - xplot[1]) < 20) {
circos.text(mean(xlim), ylim[1], sector.name, facing = "clockwise",
niceFacing = TRUE, adj = c(0, 0.5))
} else {
circos.text(mean(xlim), ylim[1], sector.name, facing = "inside",
niceFacing = TRUE, adj = c(0.5, 0))
}
}, bg.border = NA)
circos.clear()
freqpairs2 <- filter(freqpairs, ST %in% significant) %>% na.omit()
ggplot(freqpairs2, aes(ST, variable)) + geom_tile(aes(fill = value), colour = "white") + scale_fill_gradient(low = "lightblue", high = "red") + geom_text(aes(label = value)) + ggtitle("Distribution of res genes types by ST")
### normalised by column
freqpairs2_rescale <- ddply(freqpairs2, .(ST), transform, rescale = rescale(value))
ggplot(freqpairs2_rescale, aes(ST, variable)) + geom_tile(aes(fill = rescale), colour = "white") + scale_fill_gradient(low = "lightblue", high = "red") + geom_text(aes(label = value)) + ggtitle("Distribution of res genes types by ST, rescaled by column")
### Sorting works but ggplot figure doesn't use the correct order, although df_molten_dat seems correct
heat <- dcast(data = freqpairs2, formula = ST~variable, fun.aggregate = sum, value.var = "value")
heat[is.na(heat)] <- 0
rownames(heat) <- heat[,1]
heat2 <- heat[,-1]
## reorder col and rows
row.order <- hclust(dist(heat2[,-1]))$order # clustering
col.order <- hclust(dist(t(heat2[,-1])))$order
dat_new <- heat2[row.order, col.order] # re-order matrix accoring to clustering
df_molten_dat <- melt(as.matrix(dat_new)) # reshape into dataframe
names(df_molten_dat)[c(1:2)] <- c("ST", "variable")
df_molten_dat[df_molten_dat < 1] <- NA # transform back 0 into NA
## Warning in Ops.factor(left, right): '<' not meaningful for factors
## Warning in Ops.factor(left, right): '<' not meaningful for factors
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
breaks = seq(0, max(heat2),length.out=1000)
gradient1 = colorpanel( sum( breaks[-1]<1), "white", "white")
gradient2 = colorpanel( sum( breaks[-1]>=1 ), "lightblue", "red" )
hmcol2 = c(gradient1,gradient2)
# default clustering is set for hclust (complete)
heatmap(as.matrix(t(heat2)), scale="none", col = hmcol2)
title(main="Heatmap of antibiotic genes by ST", sub = "Frequencies, complete clustering", xlab = "ST", ylab = "Resistance gene")
# default clustering is set for hclust (complete)
heatmap(as.matrix(t(heat2)), scale="column", col = hmcol2)
title(main="Heatmap of antibiotic genes by ST, scaled by column", sub = "Frequencies, complete clustering", xlab = "ST", ylab = "Resistance gene")
Supplemental figure S11
res_mat_ST <- merge(x = res_mat, y = bap[, c("Id", "ST")], by = "Id", all.x=TRUE)
res_mat_ST_porine <- merge(x = res_mat_ST, y = bap[, c("Id", "Insertion", "ompK36", "ompK35")], by = "Id", all.x=TRUE)
collec <- cbind(res_mat_ST_porine[,(ncol(res_mat_ST_porine)-3):ncol(res_mat_ST_porine)],"total" = rowSums(res_mat_ST_porine[2:(ncol(res_mat_ST_porine)-4)]))
significant <- c('-', '11', '14', '16', '23', '37', '101', '147','231', '258', '437', '442', '512', '525')
collec_sig <- filter(collec, ST %in% significant) #%>% na.omit()
## requires ggpubr
collec_sig[is.na(collec_sig)] <- "-"
#compare_means(total ~ ST, data = collec_sig, group.by = "Insertion", method = "kruskal.test")
# boxplots
ggplot(collec_sig, aes(y = total, x = Insertion )) + geom_jitter(aes(alpha=1, colour = factor(ompK35))) + geom_boxplot(aes(fill = Insertion)) + scale_fill_manual(values=c("grey", "red", "dark red")) + facet_grid(. ~ ST) + theme(axis.text.x=element_text(angle=60, hjust=1)) + ggtitle("Nb of antibiotic resistance gene by ST harbouring ompK36 mutants and insertion type") + stat_compare_means(label = "p.signif", ref.group = "-")
## Warning: Computation failed in `stat_compare_means()`:
## Can't find specified reference group: 1. Allowed values include one of: 2
## Warning: Computation failed in `stat_compare_means()`:
## Can't find specified reference group: 1. Allowed values include one of: 2
Exploring further
# boxplots
ggplot(collec_sig, aes(y = total, x = factor(ompK35) )) + geom_jitter(aes(alpha=1, colour = factor(ompK35))) + geom_boxplot(aes(fill = factor(ompK35))) + facet_grid(. ~ ST) + theme(axis.text.x=element_text(angle=60, hjust=1)) + ggtitle("Nb of antibiotic resistance gene by major ST and presence of OmpK35") + stat_compare_means(label = "p.signif", ref.group = "1")
## Warning: Computation failed in `stat_compare_means()`:
## `group1`, `p` must resolve to integer column positions, not NULL
## Warning: Computation failed in `stat_compare_means()`:
## `group1`, `p` must resolve to integer column positions, not NULL
## Warning: Computation failed in `stat_compare_means()`:
## `group1`, `p` must resolve to integer column positions, not NULL
## Warning: Computation failed in `stat_compare_means()`:
## `group1`, `p` must resolve to integer column positions, not NULL
significant <- c('-', '101', '11', '14', '147', '278', '280', '15', '16', '17', '120', '200', '23', '231', '258', '127', '273', '277', '29', '278', '3', '336', '37', '392', '442', '512', '873')
### blaKPC gene by major ST and insertion type
subset <- cbind(res_mat_ST_porine$`blaKPC-2`, res_mat_ST_porine$`blaKPC-3`, res_mat_ST_porine$`blaKPC-4`, res_mat_ST_porine$`blaKPC-6`, res_mat_ST_porine$`blaKPC-9`)
collec <- cbind(res_mat_ST_porine[,(ncol(res_mat_ST_porine)-3):ncol(res_mat_ST_porine)],"total" = rowSums(subset))
collec_sig <- filter(collec, ST %in% significant) #%>% na.omit()
collec_sig_summary <- filter(as.data.frame(with(collec_sig, table(ST, Insertion, ompK36, ompK35, total))), Freq > 0)
# filter by freq cut-off of unique occurence (ST-Country-Year)
p1 <- ggplot(collec_sig_summary, aes(y = total, x = Insertion )) + geom_jitter(aes(size= Freq, alpha=1, colour = factor(ompK35)), width = 0.25, height = 0.1) + facet_grid(. ~ ST) + theme(axis.text.x=element_text(angle=60, hjust=1)) + ggtitle("Strains with blaKPC genes by major ST, ompK36 presence and insertion type") + scale_size_area(max_size = 13)
### blaCTX-M-15 gene by major ST and insertion type
subset <- cbind(res_mat_ST_porine$`blaCTX-M-15`)
collec <- cbind(res_mat_ST_porine[,(ncol(res_mat_ST_porine)-3):ncol(res_mat_ST_porine)],"total" = rowSums(subset))
collec_sig <- filter(collec, ST %in% significant) #%>% na.omit()
collec_sig_summary <- filter(as.data.frame(with(collec_sig, table(ST, Insertion, ompK36, ompK35, total))), Freq > 0)
# filter by freq cut-off of unique occurence (ST-Country-Year)
p2 <- ggplot(collec_sig_summary, aes(y = total, x = Insertion )) + geom_jitter(aes(size= Freq, alpha=1, colour = factor(ompK35)), width = 0.25, height = 0.1) + facet_grid(. ~ ST) + theme(axis.text.x=element_text(angle=60, hjust=1)) + ggtitle("Strains with blaCTX-M-15 genes by major ST, ompK36 presence and insertion type") + scale_size_area(max_size = 13)
## plotting
grid.arrange(p1,p2,ncol=1)